#Loading packages
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from IPython.display import HTML
#Load the world population dataset from its local CSV file
csv_path = "C:/Users/Nathan/Documents/Portfolio/Portfolio Data/World Population EDA/world_population.csv"
df = pd.read_csv(csv_path)
#Report the dimensions of the data as (rows, columns)
df.shape
(234, 17)
#Preview the first five rows to see the available columns and sample values
df.head(5)
| | Rank | CCA3 | Country | Capital | Continent | 2022 Population | 2020 Population | 2015 Population | 2010 Population | 2000 Population | 1990 Population | 1980 Population | 1970 Population | Area (km²) | Density (per km²) | Growth Rate | World Population Percentage |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | AFG | Afghanistan | Kabul | Asia | 41128771 | 38972230 | 33753499 | 28189672 | 19542982 | 10694796 | 12486631 | 10752971 | 652230 | 63.0587 | 1.0257 | 0.52 |
| 1 | 138 | ALB | Albania | Tirana | Europe | 2842321 | 2866849 | 2882481 | 2913399 | 3182021 | 3295066 | 2941651 | 2324731 | 28748 | 98.8702 | 0.9957 | 0.04 |
| 2 | 34 | DZA | Algeria | Algiers | Africa | 44903225 | 43451666 | 39543154 | 35856344 | 30774621 | 25518074 | 18739378 | 13795915 | 2381741 | 18.8531 | 1.0164 | 0.56 |
| 3 | 213 | ASM | American Samoa | Pago Pago | Oceania | 44273 | 46189 | 51368 | 54849 | 58230 | 47818 | 32886 | 27075 | 199 | 222.4774 | 0.9831 | 0.00 |
| 4 | 203 | AND | Andorra | Andorra la Vella | Europe | 79824 | 77700 | 71746 | 71519 | 66097 | 53569 | 35611 | 19860 | 468 | 170.5641 | 1.0100 | 0.00 |
The dataset contains population information on 234 different countries and territories, described by 17 variables:
- Rank: Ranking from 1 to 234 for each country/territory based on current population size
- CCA3: The three-letter country/territory code associated with each country/territory
- Country: The name of the country/territory
- Capital: The name of the capital for the given country/territory
- Continent: The name of the continent for the given country/territory (Asia, Africa, Europe, North America, South America, and Oceania)
- 2022 Population: The 2022 population of the given country/territory
- 2020 Population: The 2020 population of the given country/territory
- 2015 Population: The 2015 population of the given country/territory
- 2010 Population: The 2010 population of the given country/territory
- 2000 Population: The 2000 population of the given country/territory
- 1990 Population: The 1990 population of the given country/territory
- 1980 Population: The 1980 population of the given country/territory
- 1970 Population: The 1970 population of the given country/territory
- Area (km²): The land mass of the given country/territory in km²
- Density (per km²): The population density of the given country/territory in people per km²
- Growth Rate: The current population growth rate of the given country/territory
- World Population Percentage: The percentage of the total world population residing in the given country/territory

Start by ensuring that every column is complete and contains no missing values. This can be done by counting the number of non-missing instances in each variable column and comparing it with the known length of the dataset.
A quick and easy way to do this is by generating a bar chart.
#Count the non-missing entries of every column (the chart title states the data has 234 rows)
missing = df.notna().sum()
#Bar chart with one bar per column; each bar's height is that column's non-missing count
fig = (
    px.bar(
        x=missing.index,
        y=missing.values,
        text=missing.values,  #print the count on each bar for clarity
        title='World Population Dataset: Total Number of Data Points (out of 234 rows)',
    )
    .update_layout(xaxis_title='Dataset Variables', yaxis_title='Number of Instances')
    .update_traces(hovertemplate='<br> Variable: %{x} </br> Number of Instances: %{y}')
)
fig.show(renderer='notebook')
Since every column contains 234 instances, we can assume there are no missing values in the data.
To better understand the structure of the data, we may wish to know how many countries and territories are being accounted for in each continent. This can also be done using a bar chart by grouping and counting the number of Country instances within each Continent level.
#Tally how many countries/territories are listed under each continent
country_counts = df.groupby('Continent')['Country'].count()
#Bar chart with the continents on the y-axis and their tallies on the x-axis
fig = px.bar(
    x=country_counts.values,
    y=country_counts.index,
    text=country_counts.values,  #label each bar with its tally
    color=country_counts.index,  #one color per continent so they are easy to tell apart
    color_discrete_sequence=list(reversed(px.colors.sequential.Peach)),
    title='World Population Data: Number of Countries & Territories per Continent',
)
#Name the axes, then reword the hover text to match them
fig.update_layout(
    xaxis_title='Number of Countries & Territories',
    yaxis_title='Continent',
)
fig.update_traces(
    hovertemplate='<br> Continent: %{y} </br> Number of Countries Included: %{x}'
)
fig.show()
The data contains two variables that have a clear link to one another as well as to population: Area (km²) and Density (per km²).
#World map shading every country/territory by its 'Area (km²)' value
fig = px.choropleth(
    df,
    locations='Country',  #rows are matched to map shapes by country name
    locationmode='country names',
    color='Area (km²)',
    color_continuous_scale=px.colors.sequential.Bluyl,
    title='Total Land Mass by Country',
    template='ggplot2',
    projection='natural earth',
).update_geos(
    #draw the latitude/longitude grid and tint oceans and lakes
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True,
    oceancolor="LightBlue",
    showlakes=True,
    lakecolor="LightBlue",
)
fig.show()
#World map shading every country/territory by its 'Density (per km²)' value
fig = px.choropleth(
    df,
    locations='Country',
    locationmode='country names',
    color='Density (per km²)',
    color_continuous_scale=px.colors.sequential.OrRd,
    range_color=(0, 150),  #the color scale spans 0-150 people per km²
    title='Total Population Density by Country',
    template='ggplot2',
    projection='natural earth',
)
#Draw the latitude/longitude grid and tint oceans and lakes
fig.update_geos(
    lataxis=dict(showgrid=True),
    lonaxis=dict(showgrid=True),
    showocean=True,
    oceancolor="LightBlue",
    showlakes=True,
    lakecolor="LightBlue",
)
#Keep a second reference to this figure under the name 'example'
example = fig
fig.show()
A good starting point for understanding the world population is to visualize the most recent figures, the 2022 Population.
#Mean 2022 population of the countries/territories in each continent,
#rounded to whole numbers and ordered from largest to smallest
continent_populations = (
    df.groupby('Continent')['2022 Population']
    .mean()
    .round()
    .sort_values(ascending=False)
)
#Bar chart with one bar per continent showing that average
fig = px.bar(
    x=continent_populations.index,
    y=continent_populations.values,
    text=continent_populations.values,  #print the average on each bar
    color=continent_populations.index,  #one color per continent
    color_discrete_sequence=list(reversed(px.colors.sequential.Darkmint)),
    title='Average 2022 Population by Continent',
)
#Reword the hover text and name the axes
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Average Population: %{y}')
fig.update_layout(xaxis_title='Continents', yaxis_title='Average Population Count')
fig.show()
Aside from the current average population numbers, inspecting average population numbers from the past few decades could help identify any apparent trends. From the data, we can pull the 2020 Population, 2010 Population, 2000 Population, 1990 Population, 1980 Population, and 1970 Population numbers and generate similar plots for reference.
#Population columns to compare (one per decade)
years = ['2020 Population','2010 Population', '2000 Population', '1990 Population','1980 Population', '1970 Population']
#Calculate the average country/territory population for each continent.
#Only the year columns are selected before aggregating: the frame also holds
#text columns (Country, Capital, CCA3) that cannot be averaged, and pandas >= 2.0
#raises a TypeError if mean() is asked to aggregate them.
cont_pop_years = df.groupby('Continent')[years].mean().round()
#Reintroduce continent as a regular column (it is the index after groupby)
cont_pop_years = cont_pop_years.reset_index()
#Combine all the population year variables into two variables: one containing the year and one containing the corresponding pop.
cont_pop_years = pd.melt(cont_pop_years, id_vars=['Continent'], value_vars=years)
cont_pop_years = cont_pop_years.rename(columns={'variable':'Year','value':'Population'})
#Remove ' Population' from Year variable and just leave the year (literal match, not a regex)
cont_pop_years['Year'] = cont_pop_years['Year'].str.replace(' Population', '', regex=False)
#Reorganize the order of the data to match earlier plot to see progression more easily
cat = ['Asia','South America','Africa','North America','Europe','Oceania']
cont_pop_years['Continent'] = pd.Categorical(cont_pop_years['Continent'], categories=cat)
#Sorting by Year as well puts the animation frames in chronological order
cont_pop_years = cont_pop_years.sort_values(by=['Continent','Year'])
#Generate animated bar plot of average country population by continent and its change over the decades
fig = px.bar(
    cont_pop_years,
    x='Continent',
    y='Population',
    color='Continent',
    color_discrete_sequence=px.colors.sequential.Darkmint[::-1],
    title='Average Population by Continent: 1970 - 2020',
    animation_frame='Year',
    range_y=[0,95000000],  #fixed y-range so bars are comparable across frames
)
#Rename axes and hover information for clarity
fig.update_layout(xaxis_title='Continents', yaxis_title='Population Count')
fig.update_traces(hovertemplate='<br> Continent: %{x} </br> Average Population: %{y}')
fig.show()
#Donut chart of each continent's share of the summed '2022 Population' values
fig = px.pie(
    df,
    names='Continent',
    values='2022 Population',
    hole=0.25,
    color_discrete_sequence=list(reversed(px.colors.sequential.Magenta)),
    title='Total 2022 Population by Continent',
).update_traces(
    textinfo='label+percent+value'  #show name, share and total on every slice
)
fig.show()
Also using the 2020 Population, 2010 Population, 2000 Population, 1990 Population, 1980 Population, and 1970 Population numbers, we can make visuals for the total world population numbers and see how certain continents and their overall contributions have shifted over time.
#Population columns arranged as they appear in the subplot grid:
#one inner list per grid row, left column first
populations = [['1970 Population','1980 Population'],['1990 Population','2000 Population'],
               ['2010 Population','2020 Population']]
#3x2 grid of pie ('domain') subplots, titled 'Total <column name>' in row-major order
fig = make_subplots(
    rows=3,
    cols=2,
    specs=[[{'type':'domain'},{'type':'domain'}] for _ in populations],
    subplot_titles=[f'Total {column}' for pair in populations for column in pair],
    horizontal_spacing=0.3,
    column_widths=[0.5,0.5],
    vertical_spacing=0,
)
#Add one pie per decade; i is the grid row and col the grid column
for i, pop in enumerate(populations, start=1):
    for col, column in enumerate(pop, start=1):
        #Total population per continent for this decade, largest first (computed once per pie)
        totals = df.groupby('Continent')[column].sum().sort_values(ascending=False)
        fig.add_trace(
            go.Pie(
                values=totals,
                labels=totals.index,
                name='',
                marker=dict(colors=px.colors.sequential.Magenta[::-1]),
                texttemplate='<br>%{label}</br>%{percent}</br>%{value}',
                textposition='outside',
                pull=[0,0.1,0.1,0,0,0],  #pull out the second- and third-largest slices
            ),
            row=i,
            col=col,
        )
fig.update_layout(height=1500, showlegend=False, title='Total World Population: 1970 - 2020')
fig.show()
After looking at each continent as a whole, we can identify which countries and territories have contributed the most and least to the current world population.
#Five largest 2022 populations, summed per country/territory, largest first
mostpop_country = (
    df.groupby('Country')['2022 Population']
    .sum()
    .sort_values(ascending=False)
    .iloc[:5]
)
#Bar chart of those five countries/territories
fig = px.bar(
    x=mostpop_country.index,
    y=mostpop_country.values,
    text=mostpop_country.values,  #print the population on each bar
    color_discrete_sequence=['MidnightBlue'],
    title='Top 5 Most Populated Countries According to 2022 Population',
)
#Reword the hover text and name the axes for clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
fig.show()
#Calculate the 5 least populated countries/territories of 2022
leastpop_country = df.groupby('Country')['2022 Population'].sum().sort_values(ascending=True).head(5)
#Re-sort those five from largest to smallest so the bars descend left to right
leastpop_country = leastpop_country.sort_values(ascending=False)
#Generate a bar plot of the 5 least populated countries/territories of 2022
fig = px.bar(
    x=leastpop_country.index,
    y=leastpop_country.values,
    color_discrete_sequence=['IndianRed'],
    text=leastpop_country.values,  #print the population on each bar
    title='Top 5 Least Populated Countries According to 2022 Population',
)
#Update axis titles for clarity
fig.update_layout(xaxis_title='Country', yaxis_title='Population')
#Update hover information for additional clarity
fig.update_traces(hovertemplate='<br> Country: %{x} </br> 2022 Population: %{y}')
fig.show()
We can try to determine if there are any changes in these rankings by looking at the most and least populated countries and territories of the past 50 years.
#Population columns to step through, listed newest first and then flipped to chronological order
populations = ['2020 Population','2010 Population','2000 Population','1990 Population','1980 Population','1970 Population']
populations.reverse()
#For every decade, draw the 5 most and 5 least populated countries/territories side by side
for pop in populations:
    #Total population per country/territory for this decade
    country_totals = df.groupby('Country')[pop].sum()
    #Five largest, largest first
    mostpop_country = country_totals.sort_values(ascending=False).head(5)
    #Five smallest, then re-sorted so they are also shown largest first
    leastpop_country = country_totals.sort_values(ascending=True).head(5)
    leastpop_country = leastpop_country.sort_values(ascending=False)
    #One row with two xy panels sharing common axis titles
    fig = make_subplots(
        rows=1,
        cols=2,
        specs=[[{'type':'xy'},{'type':'xy'}]],
        subplot_titles=['Top 5 Most Populated Countries','Top 5 Least Populated Countries'],
        y_title='Population',
        x_title='Countries',
    )
    #Left panel: most populated (blue); right panel: least populated (red)
    panels = [(mostpop_country, 'MidnightBlue'), (leastpop_country, 'IndianRed')]
    for col, (ranking, bar_color) in enumerate(panels, start=1):
        fig.add_trace(
            go.Bar(
                x=ranking.index,
                y=ranking.values,
                texttemplate='%{y}',  #print the population on each bar
                name='',
                showlegend=False,
                marker=dict(color=bar_color),
            ),
            row=1,
            col=col,
        )
    #Use the column name as the main title so each figure states its decade
    fig.update_layout(title_text=pop)
    fig.show()
Since the data contains information on the world population broken down by each country's and territory's contribution, it may be easier to visualize each country's impact by viewing the information on a choropleth map. This makes it possible to view the aggregated population data in a geographic way.
Let's start by looking at a choropleth map of the current 2022 Population:
#Generate a choropleth map to visualize the current (2022) world population
fig = px.choropleth(
    data_frame=df,
    locations='Country',  #rows are matched to map shapes by country name
    locationmode='country names',
    color='2022 Population',
    color_continuous_scale=px.colors.sequential.RdBu[::-1],
    range_color=(0,1000000000),  #upper end of the color scale is set to 1 billion
    title='World Map Indicating 2022 Populations',
    template='ggplot2',
    projection='natural earth',
)
#Set up coloration and grids for clarity on borders and position
fig.update_geos(
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
)
#Display the figure explicitly, like the other plots, so it also renders outside a notebook cell
fig.show()
We can also view a choropleth map of the world population over the past 50 years and see how the population numbers change over the decades through an animation.
#Population columns to inspect, listed newest first
years = ['2020 Population','2010 Population', '2000 Population', '1990 Population','1980 Population', '1970 Population']
#Reverse order to see oldest to most recent
years.reverse()
#Reshape the data to long form: one row per (country, year) with the population in a single column.
#melt only reads 'Country' and the listed year columns and returns a new frame,
#so df is left untouched and no other columns need to be dropped first.
country_pop_year = pd.melt(df, id_vars=['Country'], value_vars=years)
#Rename new data set columns to match contents
country_pop_year = country_pop_year.rename(columns={'variable':'Year','value':'Population'})
#Remove ' Population' from the identifier column so it only includes the year (1970, 1980, etc.)
country_pop_year['Year'] = country_pop_year['Year'].str.replace(' Population', '', regex=False)
#Generate a choropleth animation showing the change in country/territory populations for each decade from 1970-2020
fig = px.choropleth(
    data_frame=country_pop_year,
    locations='Country',  #column used to identify countries
    locationmode='country names',
    color='Population',  #column used to set color intensity
    animation_frame='Year',  #column used to identify each animation frame
    color_continuous_scale=px.colors.sequential.RdBu[::-1],
    range_color=(0,1000000000),  #set limit on colorscale, India and China are so large the rest seem unchanged
    title='World Map Indicating Populations: 1970 - 2020',
    template='ggplot2',
    projection='natural earth',  #use the Natural Earth map projection
)
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
)
#Display the figure explicitly, like the other plots, so it also renders outside a notebook cell
fig.show()
Since roughly half of the current world population resides in China, India, and Pakistan, it makes sense to isolate Asia and take a closer look at how its populations have changed over the past 50 years.
#Generate the same animated choropleth as for the whole world, this time restricting the map scope to Asia.
#Relies on country_pop_year, the long-form (Country, Year, Population) table built earlier.
fig = px.choropleth(
    data_frame=country_pop_year,
    locations='Country',
    locationmode='country names',
    color='Population',  #colors encode population counts, not density
    animation_frame='Year',
    color_continuous_scale=px.colors.sequential.RdBu[::-1],
    range_color=(0,1000000000),
    title='Asia Population Map: 1970 - 2020',
    template='ggplot2',
    scope='asia',
)
#Include longitude, latitude, and water markings for clarity and visual appeal
fig.update_geos(
    lataxis_showgrid=True,
    lonaxis_showgrid=True,
    showocean=True, oceancolor="LightBlue",
    showlakes=True, lakecolor="LightBlue",
)
fig.show()